
/*
Clean data on number of COVID-19 cases and deaths by county and day
*/

* Helper program: rewrite the Johns Hopkins date strings, which arrive as
* month_day_year with a two-digit year (e.g. 1_22_20), as yyyy-mm-dd strings.
* Expects a string variable named date in memory and replaces it in place.
capture program drop format_date_jh
program define format_date_jh
	* break the string on "_" into date1-date3 and convert the pieces to numbers
	split date, parse("_") destring
	drop date
	rename (date1 date2 date3) (month day year)

	* zero-pad month and day to two digits
	generate month_str = cond(month < 10, "0", "") + string(month)
	generate day_str = cond(day < 10, "0", "") + string(day)

	* two-digit year -> four-digit year
	generate year_str = string(year + 2000)

	* assemble the yyyy-mm-dd string and remove the intermediate variables
	generate date = year_str + "-" + month_str + "-" + day_str
	drop *_str month day year
end

* data from Johns Hopkins: https://github.com/CSSEGISandData/COVID-19

* cases US
import delimited "${data_raw_cases}/time_series_covid19_confirmed_US.csv", clear
ren long_ lon

* The per-date columns come in as v# with the original header text kept as the
* variable label (presumably because a header containing "/" is not a valid
* variable name). Rename each one to cases_<label>, with "/" turned into "_".
foreach var of varlist v* {
	local lab: variable label `var'
	* "." as the last argument makes subinstr replace every "/" in a single
	* call, instead of one regexr call per separator
	local lab_clean = subinstr("`lab'", "/", "_", .)
	ren `var' cases_`lab_clean'
}

* keep the wide cases data aside so it can be merged onto the deaths data
tempfile cases_us
save `cases_us'


* deaths US
import delimited "${data_raw_cases}/time_series_covid19_deaths_US.csv", clear
ren long_ lon

* The per-date columns come in as v# with the original header text kept as the
* variable label (presumably because a header containing "/" is not a valid
* variable name). Rename each one to deaths_<label>, with "/" turned into "_".
foreach var of varlist v* {
	local lab: variable label `var'
	* "." as the last argument makes subinstr replace every "/" in a single
	* call, instead of one regexr call per separator
	local lab_clean = subinstr("`lab'", "/", "_", .)
	ren `var' deaths_`lab_clean'
}

* attach the confirmed-case columns to the deaths data; assert() makes the
* merge fail unless every row is matched in both files
merge 1:1 fips admin2 using `cases_us', assert(match) nogenerate

* go from one column per date to one row per (fips, admin2, date);
* j(date) holds the date suffix of the column names as a string
greshape long deaths_ cases_, i(fips admin2) j(date) string
rename deaths_ deaths_jh
rename cases_ cases_jh

* turn the m_d_yy date strings into yyyy-mm-dd
format_date_jh

* harmonize the geography names and label rows without a county name
rename admin2 county
rename province_state state
replace county = "Unknown" if county == ""

* keep a copy of the JH FIPS code under its own name, then trim the dataset
* to the identifiers and the JH variables
generate fips_jh = fips
keep fips county state date deaths_jh cases_jh fips_jh

tempfile usa_jh
save `usa_jh'


* New York Times data (US only): https://github.com/nytimes/covid-19-data
import delimited "${data_raw_cases}/us-counties.csv", clear
* NOTE(review): recodes FIPS 2997 to 2164 before merging -- presumably to map
* an NYT-specific Alaska code onto a standard county code; confirm
replace fips = 2164 if fips == 2997
rename cases cases_nyt
rename deaths deaths_nyt

* combine with the Johns Hopkins data, keeping matched and unmatched rows
merge 1:1 state county date using `usa_jh', nogenerate

* wherever both sources report a FIPS code the two must agree;
* rows without an NYT code take the JH one
assert fips == fips_jh if !(fips == . | fips_jh == .)
replace fips = fips_jh if fips == .
drop fips_jh

* A handful of small Alaska boroughs changed their administrative structure
* partway through the sample, but after we originally ran the code on the
* data from 2020-09-10. Drop them, and cut the sample at that date, to stay
* consistent with the version used in the original paper.
drop if fips == 2063 | fips == 2066 | fips == 2998
* date is a yyyy-mm-dd string, so string comparison orders it chronologically
drop if date > "2020-09-10"

* save
save "${data_derived_cases}/usa_cases_deaths_jh_nyt.dta", replace
export delimited "${data_derived_cases}/usa_cases_deaths_jh_nyt.csv", replace

* make it unique on county and date: discard rows without a FIPS code, then
* keep the first county/state name and the mean of the case and death counts
* within each fips-date cell
keep if fips != .
collapse (first) county state (mean) cases_* deaths_*, by(fips date)

save "${data_derived_cases}/usa_cases_deaths_jh_nyt_unique.dta", replace
export delimited "${data_derived_cases}/usa_cases_deaths_jh_nyt_unique.csv", replace

